#!/usr/bin/env python3


import csv
import math
import pickle
import random
import re
import string
import urllib
import urllib.parse

import numpy as np
import pandas as pd
from nltk.corpus import stopwords


###############################################################
###### GLOBALS
###############################################################

##### Site-specific setting: point this at your own processed-data directory
dataPath = '/gscratch/comdata/users/kaylea/taboo/processed_data/'
#########

# Extra stop words used in addition to NLTK's English list.
my_stop = [
    'term',
    'used',
    'usually',
    'particularly',
    'etc',
    'extremely',
    'especially',
    'one',
    'en',
    'something',
    'often',
    'synonym',
    'like',
    'etc.',
    'person',
]
# Combined stop list: the custom words first, followed by NLTK's English stop words.
stop = [*my_stop, *stopwords.words('english')]

# Input 1: coefs from the model fit. The complete file was too large, so this
# run reads a single split chunk. To use the unsplit file instead:
#   coefsFile = dataPath + 'ngrams.tsv'   (split ngrams.txt as needed)
coefsFile = dataPath + 'grammifiedData/ngramCount/xag.txt'

# Input 2: the title table.
titlesFile = dataPath + 'enwiki_current_excerpt.tsv'

# Output: every title row that matches a coef from coefsFile is written here.
# Unsplit alternative:
#   allOut = dataPath + 'grammifiedData/salienceSample.tsv'
allOut = dataPath + 'grammifiedData/splitFiles/salienceSampleXAG.tsv'

# Case-insensitive regex for text beginning with "redirect "; group 1 captures
# whatever follows on that line.
pattern = re.compile("^redirect (.*)", re.IGNORECASE)

## Read both tab-separated inputs; the first row of each file is taken as the header.
# NOTE(review): coefsFile is one chunk of a split file -- this assumes the chunk
# still starts with a header row that names an `ngram` column; confirm.
coefDF = pd.read_csv(coefsFile, sep='\t', header=0)
print('Coef reading Done.')
titleDF = pd.read_csv(titlesFile, sep='\t', header=0)
print('Title reading Done.')

# Quick visual sanity check of what was loaded.
print(coefDF.head())
print(titleDF.head())

# The n-grams to look up among the titles, in file order.
coefs = coefDF.ngram.to_list()

# Print one sample coef. The index is clamped so that a chunk with fewer than
# 11 rows (or none at all) no longer aborts the run with an IndexError.
if coefs:
    print(f"I have coefs, e.g. {coefs[min(10, len(coefs) - 1)]}.")
else:
    print("I have no coefs.")

# Build two derived columns on titleDF:
#   title          -- encodedTitle percent-decoded, with leading/trailing double quotes stripped
#   filtered_title -- title lower-cased with stop words removed, words joined by single spaces
#
# The title is lower-cased BEFORE the stop-word test. The stop list is
# lower-case, so testing the original-case words let capitalised stop words
# ("The", "Of", "A", ...) slip through and end up in filtered_title.
# NOTE(review): a missing (NaN) encodedTitle would make unquote raise -- confirm
# the titles file contains none.
_stop_lookup = set(stop)  # O(1) membership; `stop` itself is left untouched as a list

titleDF['title'] = titleDF['encodedTitle'].apply(
    lambda encoded: urllib.parse.unquote(encoded).strip('"')
)
titleDF['filtered_title'] = titleDF['title'].apply(
    lambda title: ' '.join(
        word for word in title.lower().split() if word not in _stop_lookup
    )
)

# Round 1: find any direct title matches to coefs.
#
# For every coef, collect the titleDF rows whose filtered_title equals it, then
# write all collected rows to allOut as TSV. Rows are emitted in coef order
# (title order within one coef), and a coef that occurs more than once
# contributes its rows each time.
#
# Instead of scanning the whole title table once per coef and concatenating
# onto a growing frame, the table is narrowed to possible matches in one pass,
# indexed by filtered_title, and the pieces are concatenated once at the end.

# Keep only rows whose filtered_title appears among the coefs at all.
candidateDF = titleDF[titleDF['filtered_title'].isin(coefs)]

# filtered_title -> positions (within candidateDF, ascending) of its rows.
rows_by_title = {}
for pos, filtered in enumerate(candidateDF['filtered_title']):
    rows_by_title.setdefault(filtered, []).append(pos)

matched_parts = []
for c in coefs:
    print(f'checking coef {c}')
    positions = rows_by_title.get(c)
    if positions is not None:
        matched_parts.append(candidateDF.iloc[positions])

if matched_parts:
    salienceDF = pd.concat(matched_parts)
else:
    # No matches: keep titleDF's columns so the output file still has a header row.
    salienceDF = titleDF.iloc[0:0]

print(f"Done checking {len(coefs)} coefs. Remember that this file still contains redirects and tip your server.")
salienceDF.to_csv(allOut, mode='w', sep='\t', index=False)
